From b9ef5137a5bff3017c73a1c575b67ba58f1cfa87 Mon Sep 17 00:00:00 2001 From: "iap10@labyrinth.cl.cam.ac.uk" Date: Fri, 3 Sep 2004 20:44:22 +0000 Subject: [PATCH] bitkeeper revision 1.1159.69.3 (4138d7a65FvXU3lh0Vx8Nsl4KhPxGw) Fix potential security hole in writeable pagetable implementation: We wern't ensuring that that L1 pages' VA backpointer is immutable after the backpointer is initialised when the page first becomes linked into a pagetable. The backpointer can only be released after the type count drops to zero (or 1 if the page is pinned). In summary: We now ensure that if an L1 page is used in multiple pagetables it must be at the same virtual address in all of them, and that L1 pages can only be used once in any given pagetable. None of these extra rules should be a problem for any OS. --- xen/arch/x86/domain.c | 3 ++ xen/arch/x86/memory.c | 97 +++++++++++++++------------------------- xen/arch/x86/traps.c | 4 +- xen/include/asm-x86/mm.h | 83 +++++++++++++++++++++++++++++++--- 4 files changed, 119 insertions(+), 68 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 3031d5de69..478daea09a 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -745,6 +745,9 @@ int construct_dom0(struct domain *p, { page->u.inuse.type_info &= ~PGT_type_mask; page->u.inuse.type_info |= PGT_l1_page_table; + page->u.inuse.type_info |= + ((v_start>>L2_PAGETABLE_SHIFT)+(count-1))<u.inuse.type_info &= ~PGT_va_mask; - page->u.inuse.type_info |= va_idx << PGT_va_shift; -} - - /* * We allow an L2 tables to map each other (a.k.a. linear page tables). It * needs some special care with reference counst and access permissions: @@ -463,8 +455,10 @@ get_page_from_l1e( /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */ static int get_page_from_l2e( - l2_pgentry_t l2e, unsigned long pfn, struct domain *d) + l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned long va_idx) { + int rc; + if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) return 1; @@ -475,8 +469,11 @@ get_page_from_l2e( return 0; } - if ( unlikely(!get_page_and_type_from_pagenr( - l2_pgentry_to_pagenr(l2e), PGT_l1_page_table, d)) ) + rc = get_page_and_type_from_pagenr( + l2_pgentry_to_pagenr(l2e), + PGT_l1_page_table | (va_idx<> PAGE_SHIFT, i); } #if defined(__i386__) @@ -674,11 +670,10 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 ) return update_l2e(pl2e, ol2e, nl2e); - if ( unlikely(!get_page_from_l2e(nl2e, pfn, current)) ) + if ( unlikely(!get_page_from_l2e(nl2e, pfn, current, + ((unsigned long) + pl2e & ~PAGE_MASK) >> 2 )) ) return 0; - - set_l1_page_va(l2_pgentry_val(nl2e) >> PAGE_SHIFT, - ((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2); if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) { @@ -833,10 +828,20 @@ static int do_extended_command(unsigned long ptr, unsigned long val) { case MMUEXT_PIN_L1_TABLE: case MMUEXT_PIN_L2_TABLE: + + /* When we pin an L1 page we now insist that the va + backpointer (used for writable page tables) must still be + mutable. This is an additional restriction even for guests + that don't use writable page tables, but I don't think it + will break anything as guests typically pin pages before + they are used, hence they'll still be mutable. */ + okay = get_page_and_type_from_pagenr( pfn, - (cmd==MMUEXT_PIN_L2_TABLE) ? PGT_l2_page_table : PGT_l1_page_table, + ((cmd==MMUEXT_PIN_L2_TABLE) ? + PGT_l2_page_table : (PGT_l1_page_table | PGT_va_mutable) ) , FOREIGNDOM); + if ( unlikely(!okay) ) { MEM_LOG("Error while pinning pfn %08lx", pfn); @@ -894,8 +899,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) /* * Note that we tick the clock /after/ dropping the old base's * reference count. If the page tables got freed then this will - * avoid unnecessary TLB flushes when the pages are reused. - */ + * avoid unnecessary TLB flushes when the pages are reused. */ tlb_clocktick(); } else @@ -1230,7 +1234,7 @@ int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count) switch ( (page->u.inuse.type_info & PGT_type_mask) ) { case PGT_l1_page_table: - if ( likely(get_page_type(page, PGT_l1_page_table)) ) + if ( likely(passive_get_page_type(page, PGT_l1_page_table)) ) { okay = mod_l1_entry((l1_pgentry_t *)va, mk_l1_pgentry(req.val)); @@ -1623,9 +1627,11 @@ int ptwr_do_page_fault(unsigned long addr) PTWR_PRINTK(("get user %p for va %08lx\n", &linear_pg_table[addr>>PAGE_SHIFT], addr)); #endif + + /* Testing for page_present in the L2 avoids lots of unncessary fixups */ if ( (l2_pgentry_val(linear_l2_table[addr >> L2_PAGETABLE_SHIFT]) & - _PAGE_PRESENT) && - (__get_user(pte, (unsigned long *) + _PAGE_PRESENT) && + (__get_user(pte, (unsigned long *) &linear_pg_table[addr >> PAGE_SHIFT]) == 0) ) { pfn = pte >> PAGE_SHIFT; @@ -1650,6 +1656,7 @@ int ptwr_do_page_fault(unsigned long addr) if ( l2_pgentry_val(*pl2e) >> PAGE_SHIFT != pfn ) { + /* this L1 is not in the current address space */ l1_pgentry_t *pl1e; PTWR_PRINTK(("[I] freeing l1 page %p taf %08x/%08x\n", page, page->u.inuse.type_info, @@ -1715,36 +1722,6 @@ int ptwr_do_page_fault(unsigned long addr) return 0; } -static void ptwr_init_backpointers(void) -{ - struct pfn_info *page; - unsigned long pde; - int va_idx; - - for ( va_idx = 0; va_idx < DOMAIN_ENTRIES_PER_L2_PAGETABLE; va_idx++ ) - { - /* check if entry valid */ - pde = l2_pgentry_val(linear_l2_table[va_idx]); - if ( (pde & _PAGE_PRESENT) == 0 ) - continue; - - page = &frame_table[pde >> PAGE_SHIFT]; - /* assert that page is an l1_page_table XXXcl maybe l2? */ - if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table ) { - MEM_LOG("ptwr: Inconsistent pagetable: pde %lx not an l1 page\n", - pde >> PAGE_SHIFT); - domain_crash(); - } - page->u.inuse.type_info &= ~PGT_va_mask; - page->u.inuse.type_info |= va_idx << PGT_va_shift; - } -} - -static void ptwr_disable(void) -{ - __cleanup_writable_pagetable(PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE); -} - #ifndef NDEBUG void ptwr_status(void) { diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 24f1d401d2..d488a252f2 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -330,9 +330,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code) return; /* successfully copied the mapping */ } - if ( unlikely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) ) + if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) ) { - if ( (addr >> L2_PAGETABLE_SHIFT) == ptwr_info[cpu].disconnected ) + if ( unlikely((addr >> L2_PAGETABLE_SHIFT) == ptwr_info[cpu].disconnected )) { ptwr_reconnect_disconnected(addr); return; diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 7bbe5fe06d..05813d64b7 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -74,6 +74,7 @@ struct pfn_info /* 10-bit most significant bits of va address if used as l1 page table */ #define PGT_va_shift 18 #define PGT_va_mask (((1<<10)-1)<count_info)) ) + { + /* if the page is pinned, but we're dropping the last reference + then make the va backpointer mutable again */ + nx |= PGT_va_mutable; + } } while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); } @@ -217,22 +225,35 @@ static inline int get_page_type(struct pfn_info *page, u32 type) } else if ( unlikely((x & PGT_count_mask) == 0) ) { - if ( (x & PGT_type_mask) != type ) + if ( (x & (PGT_type_mask|PGT_va_mask)) != type ) { - nx &= ~(PGT_type_mask | PGT_validated); + nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); nx |= type; /* No extra validation needed for writable pages. */ - if ( type == PGT_writable_page ) + if ( (type & PGT_type_mask) == PGT_writable_page ) nx |= PGT_validated; } } - else if ( unlikely((x & PGT_type_mask) != type) ) + else if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) { DPRINTK("Unexpected type (saw %08x != exp %08x) for pfn %08lx\n", x & PGT_type_mask, type, page_to_pfn(page)); return 0; } - else if ( unlikely(!(x & PGT_validated)) ) + else if ( (x & PGT_va_mask) == PGT_va_mutable ) + { + /* The va_backpointer is currently mutable, hence we update it. */ + nx &= ~PGT_va_mask; + nx |= type; /* we know the actual type is correct */ + } + else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask) ) ) + { + /* The va backpointer wasn't mutable, and is different :-( */ + DPRINTK("Unexpected va backpointer (saw %08x != exp %08x) for pfn %08lx\n", + x, type, page_to_pfn(page)); + return 0; + } + else if ( unlikely(!(x & PGT_validated)) ) { /* Someone else is updating validation of this page. Wait... */ while ( (y = page->u.inuse.type_info) != x ) @@ -248,7 +269,7 @@ static inline int get_page_type(struct pfn_info *page, u32 type) if ( unlikely(!(nx & PGT_validated)) ) { /* Try to validate page type; drop the new reference on failure. */ - if ( unlikely(!alloc_page_type(page, type)) ) + if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) ) { DPRINTK("Error while validating pfn %08lx for type %08x." " caf=%08x taf=%08x\n", @@ -258,12 +279,62 @@ static inline int get_page_type(struct pfn_info *page, u32 type) put_page_type(page); return 0; } + set_bit(_PGT_validated, &page->u.inuse.type_info); } return 1; } +/* This 'passive' version of get_page_type doesn't attempt to validate +the page, but just checks the type and increments the type count. The +function is called while doing a NORMAL_PT_UPDATE of an entry in an L1 +page table: We want to 'lock' the page for the brief beriod while +we're doing the update, but we're not actually linking it in to a +pagetable. */ + +static inline int passive_get_page_type(struct pfn_info *page, u32 type) +{ + u32 nx, x, y = page->u.inuse.type_info; + again: + do { + x = y; + nx = x + 1; + if ( unlikely((nx & PGT_count_mask) == 0) ) + { + DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page)); + return 0; + } + else if ( unlikely((x & PGT_count_mask) == 0) ) + { + if ( (x & (PGT_type_mask|PGT_va_mask)) != type ) + { + nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); + nx |= type; + } + } + else if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) + { + DPRINTK("Unexpected type (saw %08x != exp %08x) for pfn %08lx\n", + x & PGT_type_mask, type, page_to_pfn(page)); + return 0; + } + else if ( unlikely(!(x & PGT_validated)) ) + { + /* Someone else is updating validation of this page. Wait... */ + while ( (y = page->u.inuse.type_info) != x ) + { + rep_nop(); + barrier(); + } + goto again; + } + } + while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); + + return 1; +} + static inline void put_page_and_type(struct pfn_info *page) { -- 2.30.2